library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the project data set; the first row of the CSV holds the column names.
df <- readr::read_csv(
  file = "cs_1675_fall2021_finalproject.csv",
  col_names = TRUE
)
## Rows: 1252 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Take an initial look at the data and some basic statistics

# Compact column-by-column preview of the raw data.
glimpse(df)
## Rows: 1,252
## Columns: 11
## $ x1     <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.0~
## $ x2     <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.0~
## $ x3     <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.0~
## $ x4     <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.0~
## $ v1     <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.2~
## $ v2     <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.0~
## $ v3     <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.1~
## $ v4     <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.6~
## $ v5     <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.2~
## $ m      <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"~
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874, ~
# Five-number summaries (plus mean) for every column.
summary(df)
##        x1                 x2                 x3                 x4          
##  Min.   :0.003117   Min.   :0.001173   Min.   :0.003344   Min.   :0.001447  
##  1st Qu.:0.144825   1st Qu.:0.059488   1st Qu.:0.180333   1st Qu.:0.034732  
##  Median :0.278602   Median :0.170552   Median :0.263551   Median :0.055822  
##  Mean   :0.265192   Mean   :0.159282   Mean   :0.262674   Mean   :0.053189  
##  3rd Qu.:0.352812   3rd Qu.:0.238338   3rd Qu.:0.343556   3rd Qu.:0.072108  
##  Max.   :0.609092   Max.   :0.446306   Max.   :0.509710   Max.   :0.101868  
##        v1                  v2                 v3               v4         
##  Min.   : 0.003474   Min.   :0.002281   Min.   : 1.003   Min.   :0.01867  
##  1st Qu.: 3.335833   1st Qu.:0.334104   1st Qu.: 3.945   1st Qu.:0.30184  
##  Median : 5.137150   Median :0.515154   Median : 5.632   Median :0.48923  
##  Mean   : 5.079560   Mean   :0.503186   Mean   : 5.569   Mean   :0.49132  
##  3rd Qu.: 6.850576   3rd Qu.:0.684780   3rd Qu.: 7.189   3rd Qu.:0.67959  
##  Max.   :10.133807   Max.   :1.018897   Max.   :10.177   Max.   :0.97913  
##        v5                 m                 output      
##  Min.   : 0.006831   Length:1252        Min.   :0.0070  
##  1st Qu.: 2.439787   Class :character   1st Qu.:0.2517  
##  Median : 6.496589   Mode  :character   Median :0.4835  
##  Mean   : 5.867330                      Mean   :0.5311  
##  3rd Qu.: 9.328919                      3rd Qu.:0.8430  
##  Max.   : 9.999845                      Max.   :0.9990
# Drop `m`, the only character column, so every remaining column is numeric.
ddf <- df %>% select(-m)

# Per-variable overview: missing count, distinct count, and min/median/max.
# `na.rm = TRUE` keeps the summary columns informative even when a column
# contains missing values (which `num_missing` reports separately); without
# it a single NA would turn min/median/max into NA for that variable.
stats <- tibble::tibble(
  variable = names(ddf),
  num_missing = map_dbl(ddf, ~ sum(is.na(.x))),
  num_unique = map_dbl(ddf, n_distinct),
  min_value = map_dbl(ddf, min, na.rm = TRUE),
  median_value = map_dbl(ddf, median, na.rm = TRUE),
  max_value = map_dbl(ddf, max, na.rm = TRUE)
)

# Render the overview as a captioned table.
stats %>% knitr::kable(caption = "variables overview")
variables overview
variable num_missing num_unique min_value median_value max_value
x1 0 1245 0.003117 0.2786015 0.609092
x2 0 1250 0.001173 0.1705525 0.446306
x3 0 1250 0.003344 0.2635515 0.509710
x4 0 1235 0.001447 0.0558215 0.101868
v1 0 1252 0.003474 5.1371505 10.133807
v2 0 1249 0.002281 0.5151535 1.018897
v3 0 1252 1.002923 5.6319045 10.176830
v4 0 1252 0.018665 0.4892350 0.979126
v5 0 1252 0.006831 6.4965890 9.999845
output 0 690 0.007000 0.4835000 0.999000
# Dimensions of the distinct rows once the response column is removed.
dim(distinct(select(df, -output)))
## [1] 1252   10

First, we will graph the response with respect to each of the x inputs.

# Scatter of output against x1 with a linear-model trend line.
ggplot(df, aes(x = x1, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Based on this graph, it looks like the response becomes worse as x1 increases.

Based on this graph, the best x1 values seem to fall between 0.1 and 0.3.

# Scatter of output against x2 with a linear-model trend line.
ggplot(df, aes(x = x2, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against x3 with a linear-model trend line.
ggplot(df, aes(x = x3, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against x4 with a linear-model trend line.
ggplot(df, aes(x = x4, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Next, compute the derived input x5 and plot the output against it; also compute w, z, and t for use in later plots.

### calculate x5, z, w, and t.
# x5 is the remainder after x1..x4; z uses x5, so x5 must be defined first.
dfT <- df %>%
  tibble::as_tibble() %>%
  mutate(
    x5 = 1 - (x1 + x2 + x3 + x4),
    w = x2 / (x3 + x4),
    z = (x1 + x2) / (x4 + x5),
    t = v1 * v2
  )

### plot output against x5
ggplot(dfT, aes(x = x5, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated w input.

# Scatter of output against the derived input w with a linear trend line.
ggplot(dfT, aes(x = w, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated z input.

# Scatter of output against the derived input z with a linear trend line.
ggplot(dfT, aes(x = z, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now we will graph the response with respect to each of the v inputs.

# Scatter of output against v1 with a linear-model trend line.
ggplot(df, aes(x = v1, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against v2 with a linear-model trend line.
ggplot(df, aes(x = v2, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against v3 with a linear-model trend line.
ggplot(df, aes(x = v3, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against v4 with a linear-model trend line.
ggplot(df, aes(x = v4, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Scatter of output against v5 with a linear-model trend line.
ggplot(df, aes(x = v5, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Plotting the output in terms of the calculated t input.

# Scatter of output against the derived input t with a linear trend line.
ggplot(dfT, aes(x = t, y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Repeat the graphs above, now binned (faceted) by the machine used to produce the mixture.

First, we will graph the response with respect to each of the x inputs, as before.

# Output against x1, one panel per value of m.
ggplot(df, aes(x = x1, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against x2, one panel per value of m.
ggplot(df, aes(x = x2, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against x3, one panel per value of m.
ggplot(df, aes(x = x3, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against x4, one panel per value of m.
ggplot(df, aes(x = x4, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Now visualize the inputs derived from the x inputs, binned by machine.

# Output against the derived input x5, one panel per value of m.
ggplot(dfT, aes(x = x5, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated w input binned by machine.

# Output against the derived input w, one panel per value of m.
ggplot(dfT, aes(x = w, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated z input binned by machine.

# Output against the derived input z, one panel per value of m.
ggplot(dfT, aes(x = z, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Now we will graph the response with respect to each of the v inputs, again binned by machine.

# Output against v1, one panel per value of m.
ggplot(df, aes(x = v1, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against v2, one panel per value of m.
ggplot(df, aes(x = v2, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against v3, one panel per value of m.
ggplot(df, aes(x = v3, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against v4, one panel per value of m.
ggplot(df, aes(x = v4, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Output against v5, one panel per value of m.
ggplot(df, aes(x = v5, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Plotting the output in terms of the calculated t input binned by machine.

# Output against the derived input t, one panel per value of m.
ggplot(dfT, aes(x = t, y = output)) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Try plotting with logit transformation applied to the response.

# Logit-transformed output (boot::logit) against x1 with a linear trend line.
ggplot(df, aes(x = x1, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x2.
ggplot(df, aes(x = x2, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x3.
ggplot(df, aes(x = x3, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x4.
ggplot(df, aes(x = x4, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Next, recompute the derived inputs x5, w, z, and t, and plot the logit-transformed response against x5.

### calculate x5, z, w, and t.
# Rebuilds dfT with the same derived columns as the earlier definition;
# x5 must precede z because z is computed from it.
dfT <- df %>%
  tibble::as_tibble() %>%
  mutate(
    x5 = 1 - (x1 + x2 + x3 + x4),
    w = x2 / (x3 + x4),
    z = (x1 + x2) / (x4 + x5),
    t = v1 * v2
  )

### plot output against x5
ggplot(dfT, aes(x = x5, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated w input.

# Logit-transformed output against the derived input w.
ggplot(dfT, aes(x = w, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated z input.

# Logit-transformed output against the derived input z.
ggplot(dfT, aes(x = z, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now we will graph the logit-transformed response with respect to each of the v inputs.

# Logit-transformed output against v1 with a linear trend line.
ggplot(df, aes(x = v1, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v2.
ggplot(df, aes(x = v2, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v3.
ggplot(df, aes(x = v3, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v4.
ggplot(df, aes(x = v4, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v5.
ggplot(df, aes(x = v5, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Plotting the output in terms of the calculated t input.

# Logit-transformed output against the derived input t.
ggplot(dfT, aes(x = t, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now visualize the logit-transformed response, binned by machine as well.

# Logit-transformed output against x1, one panel per value of m.
ggplot(df, aes(x = x1, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x2, one panel per value of m.
ggplot(df, aes(x = x2, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x3, one panel per value of m.
ggplot(df, aes(x = x3, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against x4, one panel per value of m.
ggplot(df, aes(x = x4, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Now for visualizing the inputs derived by the x inputs binned by machine.

# Logit-transformed output against x5, one panel per value of m.
ggplot(dfT, aes(x = x5, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated w input binned by machine.

# Logit-transformed output against w, one panel per value of m.
ggplot(dfT, aes(x = w, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Visualizing calculated z input binned by machine.

# Logit-transformed output against z, one panel per value of m.
ggplot(dfT, aes(x = z, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Now we will graph the logit-transformed response with respect to each of the v inputs, binned by machine.

# Logit-transformed output against v1, one panel per value of m.
ggplot(df, aes(x = v1, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v2, one panel per value of m.
ggplot(df, aes(x = v2, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v3, one panel per value of m.
ggplot(df, aes(x = v3, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v4, one panel per value of m.
ggplot(df, aes(x = v4, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

# Logit-transformed output against v5, one panel per value of m.
ggplot(df, aes(x = v5, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Plotting the output in terms of the calculated t input binned by machine.

# Logit-transformed output against t, one panel per value of m.
ggplot(dfT, aes(x = t, y = boot::logit(output))) +
  geom_point() +
  geom_smooth(method = lm) +
  facet_wrap(~m)
## `geom_smooth()` using formula 'y ~ x'

Let’s look for correlated features by looking at coupled interactions.

Let’s start by plotting the product w * x1 against the output to see whether they are correlated.

# Output against the product w * x1 (parentheses kept so the axis label is unchanged).
ggplot(dfT, aes(x = (w * x1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now let’s do the same check for the product x5 * w.

# Output against the product x5 * w.
ggplot(dfT, aes(x = (x5 * w), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now let’s do the same check for the values used to calculate w: x2, x3, and x4.

# Output against the product x2 * w.
ggplot(dfT, aes(x = (x2 * w), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x3 * w.
ggplot(dfT, aes(x = (x3 * w), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x4 * w.
ggplot(dfT, aes(x = (x4 * w), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now check z for the same correlations.

# Output against the product x1 * z.
ggplot(dfT, aes(x = (x1 * z), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x2 * z.
ggplot(dfT, aes(x = (x2 * z), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x3 * z.
ggplot(dfT, aes(x = (x3 * z), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x4 * z.
ggplot(dfT, aes(x = (x4 * z), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x5 * z.
ggplot(dfT, aes(x = (x5 * z), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now check x5 against the other four x inputs.

# Output against the product x1 * x5.
ggplot(dfT, aes(x = (x1 * x5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x2 * x5.
ggplot(dfT, aes(x = (x2 * x5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x3 * x5.
ggplot(dfT, aes(x = (x3 * x5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x4 * x5.
ggplot(dfT, aes(x = (x4 * x5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now let’s check the interactions between the plain x inputs.

# Output against the product x2 * x1.
ggplot(dfT, aes(x = (x2 * x1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x3 * x1.
ggplot(dfT, aes(x = (x3 * x1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x4 * x1.
ggplot(dfT, aes(x = (x4 * x1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now with x2 instead of x1.

# Output against the product x3 * x2.
ggplot(dfT, aes(x = (x3 * x2), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product x4 * x2.
ggplot(dfT, aes(x = (x4 * x2), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now x3 in the interaction instead of x2.

# Output against the product x4 * x3.
ggplot(dfT, aes(x = (x4 * x3), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now to look for correlations among the v inputs.

Check for correlations between the derived t input and the five v inputs.

# Output against the product v1 * t.
ggplot(dfT, aes(x = (v1 * t), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v2 * t.
ggplot(dfT, aes(x = (v2 * t), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v3 * t.
ggplot(dfT, aes(x = (v3 * t), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v4 * t.
ggplot(dfT, aes(x = (v4 * t), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v5 * t.
ggplot(dfT, aes(x = (v5 * t), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now check v5 against the other four v inputs.

# Output against the product v1 * v5.
ggplot(dfT, aes(x = (v1 * v5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v2 * v5.
ggplot(dfT, aes(x = (v2 * v5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v3 * v5.
ggplot(dfT, aes(x = (v3 * v5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v4 * v5.
ggplot(dfT, aes(x = (v4 * v5), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now let’s check the interactions between the plain v inputs.

# Output against the product v2 * v1.
ggplot(dfT, aes(x = (v2 * v1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v3 * v1.
ggplot(dfT, aes(x = (v3 * v1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v4 * v1.
ggplot(dfT, aes(x = (v4 * v1), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now with v2 instead of v1.

# Output against the product v3 * v2.
ggplot(dfT, aes(x = (v3 * v2), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Output against the product v4 * v2.
ggplot(dfT, aes(x = (v4 * v2), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

Now v3 in the interaction instead of v2.

# Output against the product v4 * v3.
ggplot(dfT, aes(x = (v4 * v3), y = output)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'